rm(list = ls())
library(Lahman)
package 㤼㸱Lahman㤼㸲 was built under R version 4.0.5
library(mosaic)
Registered S3 method overwritten by 'mosaic':
method from
fortify.SpatialPolygonsDataFrame ggplot2
The 'mosaic' package masks several functions from core packages in order to add
additional features. The original behavior of these functions should not be affected by this.
Attaching package: 㤼㸱mosaic㤼㸲
The following objects are masked from 㤼㸱package:dplyr㤼㸲:
count, do, tally
The following object is masked from 㤼㸱package:Matrix㤼㸲:
mean
The following object is masked from 㤼㸱package:ggplot2㤼㸲:
stat
The following objects are masked from 㤼㸱package:stats㤼㸲:
binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test, quantile, sd, t.test, var
The following objects are masked from 㤼㸱package:base㤼㸲:
max, mean, min, prod, range, sample, sum
library(tidyr)
package 㤼㸱tidyr㤼㸲 was built under R version 4.0.5
Attaching package: 㤼㸱tidyr㤼㸲
The following objects are masked from 㤼㸱package:Matrix㤼㸲:
expand, pack, unpack
library(tidyverse)
-- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --
v tibble 3.1.0 v stringr 1.4.0
v readr 1.4.0 v forcats 0.5.1
v purrr 0.3.4
package 㤼㸱tibble㤼㸲 was built under R version 4.0.5package 㤼㸱forcats㤼㸲 was built under R version 4.0.5-- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
x mosaic::count() masks dplyr::count()
x purrr::cross() masks mosaic::cross()
x mosaic::do() masks dplyr::do()
x tidyr::expand() masks Matrix::expand()
x dplyr::filter() masks stats::filter()
x ggstance::geom_errorbarh() masks ggplot2::geom_errorbarh()
x dplyr::lag() masks stats::lag()
x tidyr::pack() masks Matrix::pack()
x mosaic::stat() masks ggplot2::stat()
x mosaic::tally() masks dplyr::tally()
x tidyr::unpack() masks Matrix::unpack()
library(dplyr)
library(mplot)
package 㤼㸱mplot㤼㸲 was built under R version 4.0.5
Attaching package: 㤼㸱mplot㤼㸲
The following object is masked from 㤼㸱package:mosaic㤼㸲:
mplot
library(ggplot2)
library(cluster)
library(factoextra)
package 㤼㸱factoextra㤼㸲 was built under R version 4.0.5Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
corrplot 0.92 loaded
library(data.table)
package 㤼㸱data.table㤼㸲 was built under R version 4.0.5data.table 1.14.0 using 4 threads (see ?getDTthreads). Latest news: r-datatable.com
Attaching package: 㤼㸱data.table㤼㸲
The following object is masked from 㤼㸱package:purrr㤼㸲:
transpose
The following objects are masked from 㤼㸱package:dplyr㤼㸲:
between, first, last
library(mod)
package 㤼㸱mod㤼㸲 was built under R version 4.0.5
Attaching package: 㤼㸱mod㤼㸲
The following object is masked from 㤼㸱package:Matrix㤼㸲:
drop
The following object is masked from 㤼㸱package:base㤼㸲:
drop
library(modelr)
Attaching package: 㤼㸱modelr㤼㸲
The following object is masked from 㤼㸱package:mosaic㤼㸲:
resample
The following object is masked from 㤼㸱package:ggformula㤼㸲:
na.warn
library(leaps)
package 㤼㸱leaps㤼㸲 was built under R version 4.0.5
library(caret)
package 㤼㸱caret㤼㸲 was built under R version 4.0.5
Attaching package: 㤼㸱caret㤼㸲
The following object is masked from 㤼㸱package:purrr㤼㸲:
lift
The following object is masked from 㤼㸱package:mosaic㤼㸲:
dotPlot
library(ISLR2)
package 㤼㸱ISLR2㤼㸲 was built under R version 4.0.5
library(glmnet)
package 㤼㸱glmnet㤼㸲 was built under R version 4.0.5Loaded glmnet 4.1-3
=======
rm(list = ls())
library(Lahman)
library(mosaic)
library(tidyr)
library(tidyverse)
library(dplyr)
library(mplot)
library(ggplot2)
library(cluster)
library(factoextra)
library(corrplot)
library(data.table)
library(mod)
library(modelr)
library(leaps)
library(caret)
library(ISLR2)
library(glmnet)
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
#Load in People, Batting, and Pitching Dataframes
data("People")
data("Batting")
data("Pitching")
#Merges player name to Batting data.
bstats <- battingStats()
str(bstats)
'data.frame': 108789 obs. of 29 variables:
$ playerID: chr "abercda01" "addybo01" "allisar01" "allisdo01" ...
$ yearID : int 1871 1871 1871 1871 1871 1871 1871 1871 1871 1871 ...
$ stint : int 1 1 1 1 1 1 1 1 1 1 ...
$ teamID : Factor w/ 149 levels "ALT","ANA","ARI",..: 136 111 39 142 111 56 111 24 56 24 ...
$ lgID : Factor w/ 7 levels "AA","AL","FL",..: 4 4 4 4 4 4 4 4 4 4 ...
$ G : int 1 25 29 27 25 12 1 31 1 18 ...
$ AB : int 4 118 137 133 120 49 4 157 5 86 ...
$ R : int 0 30 28 28 29 9 0 66 1 13 ...
$ H : int 0 32 40 44 39 11 1 63 1 13 ...
$ X2B : int 0 6 4 10 11 2 0 10 1 2 ...
$ X3B : int 0 0 5 2 3 1 0 9 0 1 ...
$ HR : int 0 0 0 2 0 0 0 0 0 0 ...
$ RBI : int 0 13 19 27 16 5 2 34 1 11 ...
$ SB : int 0 8 3 1 6 0 0 11 0 1 ...
$ CS : int 0 1 1 1 2 1 0 6 0 0 ...
$ BB : int 0 4 2 0 2 0 1 13 0 0 ...
$ SO : int 0 0 5 2 1 1 0 1 0 0 ...
$ IBB : int NA NA NA NA NA NA NA NA NA NA ...
$ HBP : int NA NA NA NA NA NA NA NA NA NA ...
$ SH : int NA NA NA NA NA NA NA NA NA NA ...
$ SF : int NA NA NA NA NA NA NA NA NA NA ...
$ GIDP : int 0 0 1 0 0 0 0 1 0 0 ...
$ BA : num 0 0.271 0.292 0.331 0.325 0.224 0.25 0.401 0.2 0.151 ...
$ PA : num 4 122 139 133 122 49 5 170 5 86 ...
$ TB : num 0 38 54 64 56 15 1 91 2 17 ...
$ SlugPct : num 0 0.322 0.394 0.481 0.467 0.306 0.25 0.58 0.4 0.198 ...
$ OBP : num 0 0.295 0.302 0.331 0.336 0.224 0.4 0.447 0.2 0.151 ...
$ OPS : num 0 0.617 0.696 0.812 0.803 ...
$ BABIP : num 0 0.271 0.303 0.326 0.328 0.229 0.25 0.404 0.2 0.151 ...
People$name <- paste(People$nameFirst, People$nameLast, sep = " ")
batting_name <- merge(Batting,
People[,c("playerID", "name")],
by = "playerID", all.x = TRUE)
#Merges player name to Pitching data.
People$name <- paste(People$nameFirst, People$nameLast, sep = " ")
pitching_name <- merge(Pitching,
People[,c("playerID", "name")],
by = "playerID", all.x = TRUE)
#Creating additional stats for bstats
bstats[is.na(bstats)] = 0
#is.nan(bstats)
bstats <- bstats %>%
mutate(K_Percent = SO / PA) %>%
mutate(BB_Percent = (BB + IBB) / PA) %>%
mutate_all(~replace(., is.nan(.), 0))
invalid factor level, NA generatedinvalid factor level, NA generated
bstats <- bstats %>%
mutate_at(vars(K_Percent, BB_Percent), funs(round(., 3)))
`funs()` was deprecated in dplyr 0.8.0.
Please use a list of either functions or lambdas:
# Simple named list:
list(mean = mean, median = median)
# Auto named with `tibble::lst()`:
tibble::lst(mean, median)
# Using lambdas
list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
bstats_salary <- bstats %>%
filter(yearID >= 1985) %>%
left_join(select(Salaries, playerID, yearID, teamID, salary),
by=c("playerID", "yearID", "teamID"))
bstats_salary[is.na(bstats_salary)] = 0
str(bstats_salary)
'data.frame': 46535 obs. of 32 variables:
$ playerID : chr "aasedo01" "abregjo01" "ackerji01" "adamsri02" ...
$ yearID : num 1985 1985 1985 1985 1985 ...
$ stint : num 1 1 1 1 1 1 1 1 1 1 ...
$ teamID : Factor w/ 149 levels "ALT","ANA","ARI",..: 5 35 134 117 33 102 94 134 134 134 ...
$ lgID : Factor w/ 7 levels "AA","AL","FL",..: 2 5 2 5 2 5 5 2 2 2 ...
$ G : num 54 6 61 54 54 91 22 12 36 14 ...
$ AB : num 0 9 0 121 0 165 36 20 0 34 ...
$ R : num 0 0 0 12 0 27 1 2 0 2 ...
$ H : num 0 0 0 23 0 46 10 4 0 4 ...
$ X2B : num 0 0 0 3 0 7 2 1 0 1 ...
$ X3B : num 0 0 0 1 0 3 0 0 0 0 ...
$ HR : num 0 0 0 2 0 6 0 1 0 0 ...
$ RBI : num 0 1 0 10 0 21 2 5 0 3 ...
$ SB : num 0 0 0 1 0 1 0 0 0 0 ...
$ CS : num 0 0 0 1 0 0 0 0 0 0 ...
$ BB : num 0 0 0 5 0 22 1 3 0 0 ...
$ SO : num 0 2 0 23 0 26 5 6 0 10 ...
$ IBB : num 0 0 0 3 0 5 0 0 0 0 ...
$ HBP : num 0 0 0 1 0 6 0 0 0 0 ...
$ SH : num 0 0 0 3 0 4 7 0 0 0 ...
$ SF : num 0 0 0 0 0 3 0 1 0 0 ...
$ GIDP : num 0 0 0 2 0 7 1 1 0 1 ...
$ BA : num 0 0 0 0.19 0 0.279 0.278 0.2 0 0.118 ...
$ PA : num 0 9 0 130 0 200 44 24 0 34 ...
$ TB : num 0 0 0 34 0 77 12 8 0 5 ...
$ SlugPct : num 0 0 0 0.281 0 0.467 0.333 0.4 0 0.147 ...
$ OBP : num 0 0 0 0.228 0 0.378 0.297 0.292 0 0.118 ...
$ OPS : num 0 0 0 0.509 0 0.845 0.63 0.692 0 0.265 ...
$ BABIP : num 0 0 0 0.219 0 0.294 0.323 0.214 0 0.167 ...
$ K_Percent : num 0 0.222 0 0.177 0 0.13 0.114 0.25 0 0.294 ...
$ BB_Percent: num 0 0 0 0.062 0 0.135 0.023 0.125 0 0 ...
$ salary : num 0 0 170000 0 147500 ...
bstats_sure <- bstats_salary %>%
filter(PA > 150) %>%
select(OPS, BABIP, K_Percent, BB_Percent, salary)
#Keep players with over 150 at bats. (We can change this value if necessary).
#Creating batting average variable.
batting1 <- bstats %>%
filter(AB >= 150)
bstats %>%
filter(playerID == "bogaexa01")
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
Lessons 1 and 2 will just be parts of the overall project. Simple things like data manipulation, apply functions, boxplots, etc. This will be data preparation items and exploratory analysis.
b <- ggplot(batting1, aes(x = teamID, y = HR)) +
geom_boxplot(col = "black", aes(fill = teamID))
b
hitters1 <- batting1 %>%
filter(yearID < 1895) %>%
select(SlugPct)
hitters2 <- batting1 %>%
filter(yearID > 1894, yearID < 1921) %>%
select(SlugPct)
hitters3 <- batting1 %>%
filter(yearID > 1920, yearID < 1969) %>%
select(SlugPct)
hitters4 <- batting1 %>%
filter(yearID > 1969) %>%
select(SlugPct)
#Organizing 4 different datasets looking at slugging percentage for the following boxplots. All of these are somewhat different eras, with the most dramatic split being from before 1920 (pre-Babe Ruth) and after 1920 (during and post-Babe Ruth)
boxplot(hitters1,
main = "Slugging percentage from late 1871 - 1894",
ylab = "Slugging percentage",
col = "blue",
horizontal = TRUE)
boxplot(hitters2,
main = "Slugging percentage from 1895-1920",
ylab = "Slugging percentage",
col = "yellow",
horizontal = TRUE)
boxplot(hitters3,
main = "Slugging percentage from 1921-1968",
ylab = "Slugging percentage",
col = "red",
horizontal = TRUE)
boxplot(hitters4,
main = "Slugging percentage from 1969 - present",
ylab = "Slugging percentage",
col = "red",
horizontal = TRUE)
sapply(hitters1, mean, na.rm = T)
SlugPct
0.3456088
sapply(hitters2, mean, na.rm = T)
SlugPct
0.348923
sapply(hitters3, mean, na.rm = T)
SlugPct
0.3972127
sapply(hitters4, mean, na.rm = T)
SlugPct
0.4088045
#Notice that gigantic increase between hitters2 and hitters3
summary(hitters1)
SlugPct
Min. :0.1220
1st Qu.:0.2900
Median :0.3380
Mean :0.3456
3rd Qu.:0.3970
Max. :0.6960
summary(hitters2)
SlugPct
Min. :0.1480
1st Qu.:0.3003
Median :0.3430
Mean :0.3489
3rd Qu.:0.3910
Max. :0.8490
summary(hitters3)
SlugPct
Min. :0.1760
1st Qu.:0.3420
Median :0.3900
Mean :0.3972
3rd Qu.:0.4440
Max. :0.8460
summary(hitters4)
SlugPct
Min. :0.1730
1st Qu.:0.3540
Median :0.4040
Mean :0.4088
3rd Qu.:0.4580
Max. :0.8630
#Keep batting stats that we want for pairs.
batting_num <- bstats %>%
filter(PA >= 150) %>%
select("BA", 'OBP', 'SlugPct', "SO", "BB", "HR")
pairs(batting_num)
careerBatting <- na.omit(bstats)
careerBatting <- careerBatting %>%
select(playerID, BA, PA, SlugPct, OBP, SO, HR) %>%
group_by(playerID) %>%
summarise_all('mean')
careerBatting_num <- careerBatting %>%
filter(PA >= 150) %>%
select(BA, PA, SlugPct, OBP, SO, HR)
pairs(careerBatting_num)
corrmatrix <- cor(batting_num)
corrplot(corrmatrix, method = 'number') #Gives us correlation from pairs graph.
careerBatting_num1 <- careerBatting_num %>%
filter(PA > 500)
res <- batting_num %>% prcomp(scale = TRUE)
res
Standard deviations (1, .., p=6):
[1] 1.8624983 1.1955799 0.8163046 0.5272521 0.3234188 0.2296540
Rotation (n x k) = (6 x 6):
PC1 PC2 PC3 PC4 PC5 PC6
BA -0.3736490 0.53149382 0.20948811 -0.39409469 0.6134310 0.049063667
OBP -0.4412694 0.38795844 -0.30295510 -0.06651166 -0.5817204 0.469217735
SlugPct -0.4816546 0.08527252 0.45916589 0.20230952 -0.3441137 -0.624948649
SO -0.2974863 -0.61917967 0.04176753 -0.71554909 -0.1194610 0.009617743
BB -0.4043725 -0.14520286 -0.75150469 0.19652707 0.2909420 -0.356888661
HR -0.4262175 -0.39403532 0.29495049 0.49870136 0.2607132 0.509317820
loadings <- res$rotation
loadings
PC1 PC2 PC3 PC4 PC5 PC6
BA -0.3736490 0.53149382 0.20948811 -0.39409469 0.6134310 0.049063667
OBP -0.4412694 0.38795844 -0.30295510 -0.06651166 -0.5817204 0.469217735
SlugPct -0.4816546 0.08527252 0.45916589 0.20230952 -0.3441137 -0.624948649
SO -0.2974863 -0.61917967 0.04176753 -0.71554909 -0.1194610 0.009617743
BB -0.4043725 -0.14520286 -0.75150469 0.19652707 0.2909420 -0.356888661
HR -0.4262175 -0.39403532 0.29495049 0.49870136 0.2607132 0.509317820
score_mat <- res$x
score_mat
PC1 PC2 PC3 PC4 PC5 PC6
[1,] -2.416723e+00 4.560698e+00 1.443027e+00 -7.333664e-01 -4.234072e-01 -1.762980e-01
[2,] 1.219725e+00 1.849015e+00 8.201252e-01 -1.952915e-01 3.981469e-01 1.822513e-01
[3,] 1.474218e+00 7.482123e-01 9.141729e-01 8.396478e-01 -2.439173e-01 -5.416897e-01
[4,] 6.203888e-01 2.304795e+00 1.057469e+00 -2.640564e-01 2.662469e-01 3.072002e-02
[5,] 2.943669e+00 3.660013e-01 5.506029e-01 4.155864e-01 5.317398e-01 -3.286813e-02
[6,] 1.777087e+00 1.308355e+00 8.996221e-01 1.318660e-01 3.669270e-01 -1.200040e-01
[7,] 1.638294e+00 1.365860e+00 9.687942e-01 2.460817e-01 2.518894e-01 -2.846689e-01
[8,] 1.277859e+00 1.429171e+00 1.237701e+00 3.160011e-01 2.384888e-01 -3.187137e-01
[9,] 2.575716e+00 7.738965e-01 5.169331e-01 3.002796e-01 5.345135e-01 1.057655e-01
[10,] 1.406601e+00 1.626801e+00 1.019207e+00 6.878197e-02 3.105486e-01 -1.622986e-01
[11,] 2.314598e+00 1.032260e+00 3.284764e-01 1.875309e-01 4.737664e-01 3.377658e-01
[12,] -2.254698e+00 4.915125e+00 1.951355e+00 -1.215630e+00 1.700315e-01 1.075539e-01
[13,] 1.714346e+00 1.191014e+00 1.054853e+00 3.211466e-01 2.940316e-01 -2.832409e-01
[14,] 4.214521e+00 -5.515752e-01 3.050290e-01 7.087223e-01 7.976724e-01 3.980669e-02
[15,] 2.271122e+00 1.078424e+00 5.187903e-01 1.175299e-01 5.660790e-01 2.481744e-01
[16,] 7.655863e-01 2.041193e+00 1.014612e+00 1.844765e-02 5.616512e-02 -1.959685e-01
[17,] 1.520407e+00 1.432379e+00 1.091449e+00 2.042299e-01 2.334932e-01 -3.524935e-01
[18,] 1.490633e+00 1.291377e+00 6.930873e-01 3.470345e-01 6.483829e-02 -9.133176e-02
[19,] -1.993183e-01 3.201744e+00 1.191853e+00 -5.892269e-01 4.491340e-01 4.036008e-01
[20,] 3.770710e-01 2.434379e+00 7.232593e-01 -1.258865e-01 2.812814e-02 1.074989e-01
[21,] 2.922852e-01 2.390471e+00 1.332478e+00 -8.020927e-02 1.240978e-01 -2.261130e-01
[22,] 7.804409e-01 2.075181e+00 2.583384e-01 -8.693810e-02 3.394759e-03 4.109240e-01
[23,] 1.371866e+00 9.825359e-01 9.688047e-01 7.256051e-01 -2.492550e-01 -5.812617e-01
[24,] -2.382359e+00 4.714902e+00 1.062824e+00 -1.046896e+00 -3.540856e-02 3.003221e-01
[25,] 1.292267e+00 2.096809e+00 6.511071e-01 -4.066068e-01 6.834969e-01 5.933871e-01
[26,] -2.777582e+00 4.944257e+00 1.755964e+00 -1.083355e+00 -5.785753e-02 1.077638e-02
[27,] 1.190319e+00 1.795224e+00 7.511231e-01 -9.836268e-02 3.912924e-01 6.458790e-02
[28,] 2.138132e+00 7.503777e-01 5.815927e-01 2.363811e-01 3.601793e-01 5.370821e-02
[29,] 2.204313e+00 9.435450e-01 5.069415e-01 1.122817e-01 4.312682e-01 1.689245e-01
[30,] 2.534510e+00 9.701706e-01 6.262459e-01 7.652271e-02 7.558703e-01 2.656094e-01
[31,] 1.554444e+00 1.436907e+00 1.008768e+00 9.023136e-02 5.609106e-01 1.073257e-01
[32,] 1.819823e+00 1.362150e+00 6.227748e-01 4.177417e-02 4.474395e-01 1.647032e-01
[33,] 3.601595e+00 -6.392722e-01 4.603145e-01 4.955963e-01 5.099287e-01 -2.749128e-01
[34,] 2.645202e-01 2.481165e+00 9.207404e-01 -6.058332e-01 4.796779e-01 3.919094e-01
[35,] 3.421297e+00 1.147581e-02 4.038722e-01 5.460176e-01 6.062009e-01 1.169545e-02
[36,] 2.471482e-01 2.481274e+00 8.898948e-01 -5.414952e-01 3.639352e-01 1.736046e-01
[37,] 2.150533e+00 1.295666e+00 5.544389e-01 1.153722e-02 6.516006e-01 3.546918e-01
[38,] 3.006876e+00 8.594070e-03 8.192747e-01 5.894401e-01 4.886067e-01 -3.422591e-01
[39,] 1.854183e-02 2.884613e+00 1.174663e+00 -5.676400e-01 3.734326e-01 1.501689e-01
[40,] 1.393717e+00 1.754510e+00 7.957141e-01 -1.746703e-01 5.928169e-01 3.670016e-01
[41,] 2.260195e+00 6.986164e-01 1.076252e+00 4.301822e-01 3.090228e-01 -5.521915e-01
[42,] 5.987150e-02 2.642965e+00 1.483975e+00 -2.125679e-01 2.386474e-01 -2.420331e-01
[43,] 4.316153e-01 2.249363e+00 8.628850e-01 -2.927017e-01 3.697471e-01 1.355797e-01
[44,] 1.114305e+00 1.925993e+00 7.971480e-01 -1.876282e-01 4.587106e-01 1.283862e-01
[45,] 1.854586e-01 2.651860e+00 1.294539e+00 -4.079149e-01 5.125728e-01 2.486702e-01
[46,] 1.893873e-01 2.772122e+00 1.357466e+00 -4.803023e-01 5.973779e-01 3.360247e-01
[47,] 7.292379e-01 1.737208e+00 -2.609344e-01 -9.363625e-02 -4.368283e-02 4.379741e-01
[48,] 1.847550e+00 1.276145e+00 7.167369e-01 -2.892983e-02 5.097654e-01 1.042807e-01
[49,] 3.626601e+00 -8.547987e-02 2.693330e-01 4.713822e-01 7.559704e-01 1.828194e-01
[50,] 1.930785e+00 1.394545e+00 6.893944e-01 -6.183811e-02 6.532000e-01 2.401690e-01
[51,] -6.078538e-02 3.176777e+00 1.211221e+00 -6.759564e-01 4.986540e-01 3.306064e-01
[52,] 2.339012e+00 6.987610e-01 1.334819e-01 2.047737e-01 4.287504e-01 2.192671e-01
[53,] 9.842678e-01 2.225258e+00 1.096110e+00 -3.051193e-01 5.516512e-01 1.454291e-01
[54,] 1.455407e+00 1.672343e+00 9.417622e-01 -6.577560e-02 4.895289e-01 -1.238487e-03
[55,] 4.300761e+00 -9.084043e-01 2.143078e-01 9.553783e-01 6.595707e-01 -6.440636e-02
[56,] 4.710233e-01 1.423581e+00 1.662245e+00 4.631328e-01 2.677690e-01 -3.133157e-01
[57,] 1.649192e+00 1.423337e+00 1.162990e+00 7.234994e-02 5.394208e-01 -1.026508e-01
[58,] 2.768852e+00 1.478174e-01 8.899297e-01 5.451073e-01 3.915213e-01 -3.815478e-01
[59,] 2.099573e+00 1.023482e+00 9.221362e-01 1.216150e-01 6.103257e-01 1.984430e-02
[60,] 1.569637e+00 1.275754e+00 1.234741e+00 -7.314692e-03 4.644239e-01 -2.085609e-01
[61,] -1.498821e-01 3.080767e+00 1.334832e+00 -5.264526e-01 3.427686e-01 4.359203e-02
[62,] 2.568969e+00 9.856887e-01 6.948187e-01 8.865470e-02 7.938021e-01 2.354448e-01
[63,] 1.673655e+00 9.690939e-01 8.796657e-01 1.193615e-01 4.153881e-01 -3.410592e-02
[64,] -4.853341e-02 2.629401e+00 1.554874e+00 -1.519071e-01 2.415414e-01 -2.272155e-01
[65,] 2.410371e+00 7.924908e-01 2.061873e-01 2.658986e-01 4.473866e-01 2.400031e-01
[66,] 2.280413e+00 9.146890e-01 8.091815e-01 2.796219e-01 4.894963e-01 -1.584045e-01
[67,] -2.505357e-01 3.082187e+00 1.431683e+00 -4.758327e-01 3.200074e-01 8.442382e-02
[68,] -1.108737e+00 4.083915e+00 1.210488e+00 -1.059773e+00 4.674161e-01 5.476804e-01
[69,] 2.146383e+00 1.108095e+00 5.400660e-01 1.272634e-01 6.767289e-01 3.062165e-01
[70,] -3.332913e+00 5.029997e+00 1.529292e+00 -8.384134e-01 -1.736625e-01 -2.189377e-01
[71,] 1.917417e+00 1.118149e+00 1.174363e+00 2.949981e-01 3.501859e-01 -4.599910e-01
[72,] 2.510081e+00 4.956159e-01 4.078296e-01 5.372300e-01 4.147258e-01 -5.399929e-02
[73,] 1.842099e+00 1.296739e+00 6.460042e-01 -8.950588e-02 5.084560e-01 2.165966e-01
[74,] 1.946117e+00 1.088249e+00 9.342168e-01 1.443770e-01 5.197305e-01 -4.137615e-02
[75,] 4.010638e+00 -9.043144e-01 -9.277337e-02 5.143939e-01 5.910971e-01 1.226830e-01
[76,] 2.317837e+00 5.746190e-01 5.556606e-01 4.191115e-01 4.862667e-01 5.763530e-02
[77,] 3.074563e-01 2.534392e+00 1.401686e+00 -3.824038e-01 4.268553e-01 -1.232876e-03
[78,] 1.167005e+00 1.809348e+00 1.129422e+00 -4.721470e-02 4.501069e-01 -2.529551e-02
[79,] 1.632560e+00 1.468100e+00 9.918127e-01 1.531525e-03 4.588732e-01 -9.019922e-02
[80,] 2.741739e+00 5.302697e-01 3.604595e-01 2.144569e-01 5.750893e-01 2.100966e-01
[81,] 1.971533e+00 1.017793e+00 9.299388e-01 1.804371e-01 6.125357e-01 6.301301e-02
[82,] 2.721711e+00 2.680274e-01 2.327942e-02 2.907707e-01 3.877542e-01 2.068869e-01
[83,] 2.135547e-01 2.706228e+00 1.040692e+00 -4.148453e-01 3.578101e-01 1.113258e-01
[84,] 2.376955e+00 6.755834e-01 6.517979e-01 1.401077e-01 4.918054e-01 -3.603275e-02
[85,] 2.360438e+00 7.127484e-01 6.694583e-01 2.299599e-01 5.813277e-01 5.283155e-02
[86,] 4.128479e-02 2.776277e+00 1.415481e+00 -3.831971e-01 3.556296e-01 -7.593776e-03
[87,] -4.389638e-01 3.392092e+00 8.501625e-01 -7.063879e-01 3.920052e-01 4.741925e-01
[88,] 1.770532e+00 1.189871e+00 1.037435e+00 1.423073e-01 4.706187e-01 -1.423005e-01
[89,] 1.930187e+00 9.620627e-01 7.382488e-01 2.714791e-01 3.795639e-01 -9.154777e-02
[90,] 2.155863e+00 9.065004e-01 8.945369e-01 3.965488e-01 4.249762e-01 -1.818956e-01
[91,] 1.523907e-01 2.862417e+00 1.276905e+00 -4.713282e-01 3.751188e-01 9.063007e-02
[92,] 9.995124e-01 1.823088e+00 1.170911e+00 1.344113e-02 4.476761e-01 -1.903802e-02
[93,] 1.110998e+00 2.110401e+00 9.938894e-01 -2.666306e-01 5.675575e-01 1.759913e-01
[94,] 5.192421e-01 2.435532e+00 1.302713e+00 -3.091739e-01 4.550121e-01 7.071398e-02
[95,] 7.936573e-01 2.224945e+00 1.313504e+00 -3.269706e-01 4.517846e-01 -6.770236e-02
[96,] 8.403641e-01 2.256260e+00 1.178937e+00 -2.087666e-01 4.417500e-01 -3.706420e-02
[97,] 1.148718e+00 1.669228e+00 2.253819e-02 -2.459589e-01 2.848365e-01 5.532400e-01
[98,] 1.185487e+00 1.572026e+00 3.058708e-01 -8.272972e-02 2.987945e-01 1.777574e-01
[99,] 2.380042e+00 6.606109e-01 7.417901e-01 -1.160135e-01 6.060816e-01 9.513828e-02
[100,] 3.908648e+00 -6.824891e-01 -8.243134e-02 8.576197e-01 4.371080e-01 -5.428605e-02
[101,] 2.174089e+00 1.264951e+00 6.906613e-01 2.511012e-02 7.083800e-01 2.293995e-01
[102,] 1.618655e+00 1.738117e+00 8.890477e-01 -1.790972e-01 6.765123e-01 2.320327e-01
[103,] 1.964422e+00 1.146938e+00 4.015434e-01 6.802530e-02 4.616389e-01 2.143541e-01
[104,] -1.042642e+00 3.631385e+00 1.599374e+00 -6.467583e-01 2.788696e-01 1.557212e-01
[105,] -3.249017e-01 2.867747e+00 1.670345e+00 -2.033250e-01 3.077568e-01 -1.065481e-01
[106,] 1.571414e-01 2.589093e+00 1.581907e+00 -1.917588e-01 1.676164e-01 -4.610063e-01
[107,] 3.075156e+00 -2.184416e-01 1.588215e-01 3.996932e-01 3.985028e-01 1.536229e-01
[108,] 9.464263e-01 2.334568e+00 9.881629e-01 -4.426485e-01 6.270796e-01 3.568946e-01
[109,] -6.112114e-01 3.051586e+00 8.299057e-01 -3.936699e-01 1.653944e-01 1.061102e-01
[110,] -2.891211e-01 3.207652e+00 1.054573e+00 -6.829069e-01 3.826751e-01 3.203025e-01
[111,] 1.958538e+00 1.147893e+00 4.340068e-01 1.390249e-01 5.734960e-01 2.825576e-01
[112,] 8.340147e-02 2.055687e+00 1.394886e+00 1.639048e-01 1.670183e-01 -3.031638e-01
[113,] 1.948431e+00 1.402737e+00 5.745996e-01 -8.495996e-02 6.732018e-01 3.401546e-01
[114,] 1.331906e+00 1.598962e+00 1.169111e+00 7.239512e-02 4.155062e-01 -1.724547e-01
[115,] 2.089271e+00 8.270733e-01 9.166172e-01 3.041473e-01 5.516338e-01 -6.825203e-02
[116,] 6.652700e-01 2.346548e+00 1.092434e+00 -3.183514e-01 5.451402e-01 2.086985e-01
[117,] 2.182842e+00 9.434587e-01 7.752312e-01 3.309184e-01 5.282321e-01 -5.022710e-02
[118,] 5.013390e-01 2.533947e+00 1.210166e+00 -3.879506e-01 4.280099e-01 5.845775e-02
[119,] 2.398461e+00 5.670894e-01 6.248191e-01 2.635547e-01 5.100397e-01 1.409121e-03
[120,] -1.188190e+00 3.900769e+00 1.819585e+00 -8.126923e-01 3.070541e-01 4.488749e-02
[121,] 2.227782e+00 1.044966e+00 9.926182e-01 1.859686e-01 5.704792e-01 -1.576408e-01
[122,] 6.265958e-01 2.374716e+00 7.351219e-01 -2.704773e-01 3.419254e-01 2.442308e-01
[123,] -1.527206e+00 3.792398e+00 1.498283e+00 -5.826627e-01 2.284069e-01 1.143657e-01
[124,] 1.949358e+00 7.736311e-01 4.591902e-01 4.113739e-01 3.819606e-01 2.007939e-02
[125,] 9.546985e-01 1.712587e+00 1.279208e+00 6.244127e-02 3.509531e-01 -2.032127e-01
[126,] 4.175919e+00 -6.018102e-01 3.044794e-01 5.929268e-01 8.285393e-01 6.653842e-02
[127,] 3.073146e+00 2.745347e-01 7.352831e-01 4.814719e-01 5.952238e-01 -1.992386e-01
[128,] 1.774820e+00 1.289509e+00 6.669919e-01 -1.641910e-02 5.258865e-01 6.206168e-02
[129,] 5.375029e-01 2.595559e+00 9.792746e-01 -4.384009e-01 5.128877e-01 2.760494e-01
[130,] 1.598670e+00 1.663336e+00 8.385291e-01 -1.327358e-01 5.606892e-01 2.006034e-01
[131,] 8.938875e-02 2.782653e+00 9.391116e-01 -4.292095e-01 3.250307e-01 1.562155e-01
[132,] 4.491808e+00 -1.287511e+00 3.988355e-01 6.978455e-01 6.750664e-01 -2.784215e-01
[133,] 3.277880e+00 -6.584339e-02 6.927470e-01 4.305486e-01 6.061131e-01 -2.223229e-01
[134,] 1.638951e+00 1.263582e+00 1.160531e+00 2.927038e-01 3.091054e-01 -3.312571e-01
[135,] 3.296838e+00 2.304245e-01 3.456176e-01 2.485582e-01 8.001902e-01 3.026274e-01
[136,] 3.596579e+00 -2.680050e-01 5.883358e-01 5.459397e-01 6.462194e-01 -1.850761e-01
[137,] 2.454556e+00 6.178760e-01 7.683971e-01 3.774372e-01 4.885282e-01 -9.600194e-02
[138,] 3.253124e+00 2.323717e-01 4.309048e-01 3.185377e-01 7.544745e-01 1.730422e-01
[139,] 1.970733e+00 1.018884e+00 1.053743e+00 1.557218e-01 5.270654e-01 -1.585225e-01
[140,] 1.732010e+00 1.305955e+00 9.698273e-01 4.194498e-02 5.546218e-01 9.694220e-03
[141,] 2.225008e+00 7.309146e-01 6.902748e-01 3.084738e-02 4.856930e-01 -3.255164e-02
[142,] 1.097909e+00 1.474785e+00 1.536971e+00 3.588217e-01 2.872577e-01 -4.562505e-01
[143,] -2.784413e-01 2.874435e+00 1.589095e+00 -2.842018e-01 7.354918e-02 -4.702583e-01
[144,] 3.543882e+00 -3.235219e-01 6.006674e-01 3.081664e-01 7.149754e-01 -8.164553e-02
[145,] 2.129066e+00 8.346168e-01 6.714373e-01 2.156418e-01 6.212377e-01 1.639034e-01
[146,] 2.778449e+00 4.690608e-01 5.346695e-01 3.530826e-01 5.238112e-01 -1.691195e-02
[147,] 3.318391e+00 -4.019898e-02 1.329822e-01 4.554767e-01 5.679152e-01 1.507273e-01
[148,] 1.961713e+00 1.228795e+00 6.940204e-01 -2.294416e-01 6.582778e-01 2.966505e-01
[149,] 6.883506e-01 2.237150e+00 1.111198e+00 -1.600857e-01 3.408241e-01 -1.358345e-01
[150,] 2.674405e+00 6.525884e-01 5.475495e-01 3.712414e-02 7.278302e-01 2.354851e-01
[151,] 3.556008e+00 -2.199490e-01 4.501220e-01 4.201832e-01 7.066383e-01 -9.600422e-03
[152,] 7.393203e-01 2.392853e+00 1.287022e+00 -3.290517e-01 4.924273e-01 1.329802e-02
[153,] 1.280523e+00 2.026696e+00 8.107106e-01 -2.868386e-01 6.492289e-01 3.371576e-01
[154,] 2.170348e+00 1.131354e+00 6.813562e-01 -6.638855e-02 6.787592e-01 2.330541e-01
[155,] 1.849284e+00 9.991822e-01 5.051238e-01 -5.257306e-02 4.897946e-01 2.653435e-01
[156,] 1.870168e+00 1.316393e+00 6.006543e-01 -4.324355e-02 5.540260e-01 2.129298e-01
[157,] 1.371166e+00 1.486129e+00 1.384976e+00 2.654090e-01 3.073133e-01 -4.855113e-01
[158,] 7.454876e-01 2.311675e+00 9.709886e-01 -3.586801e-01 4.676539e-01 1.803653e-01
[159,] 3.140966e+00 -1.743634e-01 3.273602e-01 3.639460e-01 5.143199e-01 -9.705433e-02
[160,] 2.111663e+00 1.092447e+00 5.673255e-01 4.419451e-02 5.741471e-01 1.651516e-01
[161,] 2.319455e+00 8.086177e-01 7.777198e-01 3.865898e-01 5.497539e-01 -1.071935e-01
[162,] 1.545696e+00 1.581049e+00 7.992864e-01 -1.243311e-01 5.378649e-01 1.187910e-01
[163,] 3.098880e-01 2.733051e+00 1.364830e+00 -3.830859e-01 4.013289e-01 -2.839183e-02
[164,] 1.053480e+00 2.035085e+00 1.117748e+00 -2.013981e-01 4.824652e-01 -2.245388e-02
[165,] 3.807886e+00 -5.048465e-01 5.647328e-01 7.574360e-01 5.701567e-01 -3.459630e-01
[166,] 2.855763e+00 5.998251e-01 4.149993e-01 3.069515e-01 6.704102e-01 1.883833e-01
[ reached getOption("max.print") -- omitted 35229 rows ]
get_eig(res)
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
get_eig(res) %>%
ggplot(aes(x = 1:6, y = cumulative.variance.percent)) +
geom_line() +
geom_point() +
geom_hline(yintercept = 80) +
xlab("Principal Component") +
ylab("Proportion of Variance Explained") +
ggtitle("Scree Plot of Principal Component for Batting Statistics")
2 Principal Components: PC1 and PC2
fviz_screeplot(res, main = "Scree Plot")
Can Identify an elbow in 3.
res %>%
fviz_pca_var(axes = c(1,2),
col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE
)
<<<<<<< HEAD
#NOT COMPLETE!!!!! This was just a test, bstats is way too big.
bstats_best <- bstats %>%
filter(PA >= 600)
eu_dist <- get_dist(careerBatting_num1, method = 'euclidean')
hc_complete <- hclust(eu_dist, method = 'complete')
plot(hc_complete)
res_test <- careerBatting_num1 %>% kmeans(7)
str(res_test)
List of 9
$ cluster : int [1:313] 1 5 7 2 7 5 6 6 7 1 ...
$ centers : num [1:7, 1:6] 0.284 0.292 0.269 0.292 0.274 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:7] "1" "2" "3" "4" ...
.. ..$ : chr [1:6] "BA" "PA" "SlugPct" "OBP" ...
$ totss : num 651407
$ withinss : num [1:7] 11692 9601 15264 7699 18968 ...
$ tot.withinss: num 108892
$ betweenss : num 542514
$ size : int [1:7] 21 37 29 27 56 91 52
$ iter : int 3
$ ifault : int 0
- attr(*, "class")= chr "kmeans"
distance <- get_dist(careerBatting_num1, method = "euclidean")
sil <- silhouette(x = res_test$cluster, dist = distance)
summary(sil)
Silhouette of 313 units in 7 clusters from silhouette.default(x = res_test$cluster, dist = distance) :
Cluster sizes and average silhouette widths:
21 37 29 27 56 91 52
0.4003255 0.3618949 0.3404187 0.4165085 0.3245905 0.4092127 0.2819611
Individual silhouette widths:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.06123 0.22568 0.39691 0.36100 0.50462 0.62451
sil %>% head()
cluster neighbor sil_width
[1,] 1 4 0.31068125
[2,] 5 3 0.32305899
[3,] 7 1 0.08784106
[4,] 2 7 0.12656126
[5,] 7 2 0.40706851
[6,] 5 7 0.16599027
fviz_silhouette(sil)
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
fviz_nbclust(careerBatting_num1, hcut, hc_method = "complete", hc_metric = "euclidean", method = "wss")
##This is to test other values of K for the silhouette method.
res_test1 <- careerBatting_num1 %>% kmeans(10 )
str(res_test1)
List of 9
$ cluster : int [1:313] 10 4 6 3 3 3 8 2 6 10 ...
$ centers : num [1:10, 1:6] 0.287 0.278 0.285 0.273 0.269 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:10] "1" "2" "3" "4" ...
.. ..$ : chr [1:6] "BA" "PA" "SlugPct" "OBP" ...
$ totss : num 651407
$ withinss : num [1:10] 5561 3421 7533 8158 17850 ...
$ tot.withinss: num 82705
$ betweenss : num 568701
$ size : int [1:10] 47 26 32 35 31 29 28 40 25 20
$ iter : int 4
$ ifault : int 0
- attr(*, "class")= chr "kmeans"
distance <- get_dist(careerBatting_num1, method="euclidean")
sil <- silhouette(x = res_test1$cluster, dist = distance)
summary(sil)
Silhouette of 313 units in 10 clusters from silhouette.default(x = res_test1$cluster, dist = distance) :
Cluster sizes and average silhouette widths:
47 26 32 35 31 29 28 40 25 20
0.4123528 0.2235291 0.3095730 0.3662893 0.2348064 0.2444163 0.3084443 0.2360885 0.4169711 0.3504059
Individual silhouette widths:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.07461 0.18155 0.32364 0.31245 0.44921 0.62394
sil %>% head()
cluster neighbor sil_width
[1,] 10 6 0.25143884
[2,] 4 8 0.56107068
[3,] 6 10 0.21097598
[4,] 3 2 0.37769870
[5,] 3 6 0.09750601
[6,] 3 4 0.41195414
fviz_silhouette(sil)
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
Linear Regression comparing team payroll and win rate.
teams = as.data.table(Teams)
teams = teams[, .(yearID,
lgID = as.character(lgID),
teamID = as.character(teamID),
franchID = as.character(franchID),
Rank, G, W, L, R, ERA, SO,
WinPercent = W/(W+L))]
salaries = as.data.table(Salaries)
salaries = salaries[, c("lgID", "teamID", "salary1M") :=
list(as.character(lgID), as.character(teamID), salary / 1e6L)]
payroll = salaries[, .(payroll = sum(salary1M)), by=.(teamID, yearID)]
teamPayroll = merge(teams, payroll, by = c("teamID", "yearID"))
ggplot(data = teamPayroll, aes(x = payroll, y = WinPercent)) + geom_point() + labs(x = "Payroll (in millions)", y = "Win Percentage") +
geom_smooth(method = lm, se = FALSE)
mod_lm <- lm(data = teamPayroll, WinPercent~payroll)
mod_lm
Call:
lm(formula = WinPercent ~ payroll, data = teamPayroll)
Coefficients:
(Intercept) payroll
0.4796007 0.0003396
summary(mod_lm)
Call:
lm(formula = WinPercent ~ payroll, data = teamPayroll)
Residuals:
Min 1Q Median 3Q Max
-0.230866 -0.048237 -0.000954 0.049584 0.211074
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.4796007 0.0037895 126.561 < 2e-16 ***
payroll 0.0003396 0.0000512 6.633 5.61e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.06714 on 916 degrees of freedom
Multiple R-squared: 0.04583, Adjusted R-squared: 0.04479
F-statistic: 44 on 1 and 916 DF, p-value: 5.611e-11
payroll_pred <- teamPayroll %>%
add_predictions(mod_lm)
payroll_pred %>%
filter(yearID >= 2010) %>%
arrange(desc(pred)) %>%
head(25)
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
payroll_pred %>%
filter(yearID >= 2010) %>%
arrange(desc(WinPercent)) %>%
head(25)
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
Only five teams are in the top 25 of both payroll and win percentage in the 2010s. These teams are the 2011 Phillies, 2011 Yankees, 2010 Yankees, 2012 Yankees, and 2016 Rangers. This shows that spending the most money doesn’t automatically mean you are getting the best product on the field. ## Simple Linear Regression
bstats_salary <- bstats_salary %>%
filter(PA >= 100) %>%
filter(salary > 500000)
lm_mod <- lm(salary ~ H, HR, data = bstats_salary_21century)
summary(lm_mod)
Call:
lm(formula = salary ~ H, data = bstats_salary_21century, subset = HR)
Residuals:
Min 1Q Median 3Q Max
-4454703 -1184411 -175489 774007 14030406
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -175015.7 82216.4 -2.129 0.0333 *
H 39604.4 661.7 59.854 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2299000 on 3982 degrees of freedom
Multiple R-squared: 0.4736, Adjusted R-squared: 0.4735
F-statistic: 3583 on 1 and 3982 DF, p-value: < 2.2e-16
lm_mod_prd <- bstats_salary_21century %>% add_predictions(lm_mod)
lm_mod_prd
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
full_model <- lm(salary ~., data = bstats_salary_21century)
summary(full_model)
Call:
lm(formula = salary ~ ., data = bstats_salary_21century)
Residuals:
Min 1Q Median 3Q Max
-15323368 -1127665 0 1254066 12871792
Coefficients: (3 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.286e+09 5.422e+07 -23.712 < 2e-16 ***
playerIDabreujo02 -7.317e+06 2.046e+06 -3.576 0.000354 ***
playerIDackledu01 -1.297e+07 1.806e+06 -7.180 8.68e-13 ***
playerIDadamecr01 -1.070e+07 3.117e+06 -3.432 0.000608 ***
playerIDadamsda02 -1.347e+07 3.085e+06 -4.365 1.31e-05 ***
playerIDadamsma01 -1.383e+07 1.996e+06 -6.927 5.22e-12 ***
playerIDadducji02 -1.145e+07 3.093e+06 -3.700 0.000219 ***
playerIDadriaeh01 -1.127e+07 3.099e+06 -3.637 0.000281 ***
playerIDagbaybe01 -4.126e+06 3.115e+06 -1.325 0.185362
playerIDahmedni01 -1.344e+07 2.334e+06 -5.756 9.44e-09 ***
playerIDalfoned01 -3.760e+06 1.812e+06 -2.075 0.038056 *
playerIDalicelu01 -4.490e+06 3.108e+06 -1.445 0.148667
playerIDalomaro01 -2.833e+06 1.983e+06 -1.429 0.153183
playerIDalomasa02 -5.953e+06 1.834e+06 -3.245 0.001186 **
playerIDalonsyo01 -1.311e+07 1.672e+06 -7.839 6.20e-15 ***
playerIDaloumo01 -2.724e+06 1.589e+06 -1.715 0.086476 .
playerIDaltheaa01 -1.513e+07 3.080e+06 -4.913 9.45e-07 ***
playerIDaltuvjo01 -1.197e+07 2.039e+06 -5.873 4.75e-09 ***
playerIDalvarpe01 -1.057e+07 1.612e+06 -6.560 6.29e-11 ***
playerIDamarial01 -1.192e+07 2.333e+06 -5.109 3.44e-07 ***
playerIDamezaal01 -5.945e+06 3.118e+06 -1.907 0.056677 .
playerIDanderga01 -4.053e+06 1.445e+06 -2.805 0.005058 **
playerIDanderma02 -5.572e+06 1.793e+06 -3.108 0.001900 **
playerIDandinro01 -1.112e+07 3.099e+06 -3.588 0.000338 ***
playerIDandruel01 -4.611e+06 1.710e+06 -2.696 0.007049 **
playerIDankieri01 -9.386e+06 1.668e+06 -5.627 1.99e-08 ***
playerIDaokino01 -1.109e+07 1.696e+06 -6.539 7.21e-11 ***
playerIDarciaos01 -1.335e+07 2.354e+06 -5.669 1.57e-08 ***
playerIDarenano01 -1.197e+07 2.365e+06 -5.062 4.40e-07 ***
playerIDarencjp01 -1.314e+07 2.327e+06 -5.647 1.78e-08 ***
playerIDariasjo01 -9.841e+06 2.351e+06 -4.187 2.91e-05 ***
playerIDascheco01 -1.435e+07 2.292e+06 -6.263 4.30e-10 ***
playerIDatkinga01 -5.761e+06 1.983e+06 -2.904 0.003706 **
playerIDaurilri01 -6.197e+06 1.478e+06 -4.194 2.81e-05 ***
playerIDausmubr01 -6.107e+06 1.571e+06 -3.887 0.000104 ***
playerIDavilaal01 -1.239e+07 1.689e+06 -7.333 2.85e-13 ***
playerIDavilemi01 -1.088e+07 1.596e+06 -6.819 1.10e-11 ***
playerIDaybarer01 -8.724e+06 1.537e+06 -5.676 1.50e-08 ***
playerIDaybarwi01 -9.228e+06 2.318e+06 -3.980 7.04e-05 ***
playerIDbaergca01 -5.648e+06 3.107e+06 -1.818 0.069184 .
playerIDbaezja01 -1.390e+07 3.113e+06 -4.467 8.21e-06 ***
playerIDbagweje01 5.219e+06 1.834e+06 2.846 0.004454 **
playerIDbakerje03 -1.032e+07 1.803e+06 -5.725 1.14e-08 ***
playerIDbakerjo01 -1.191e+07 2.307e+06 -5.164 2.57e-07 ***
playerIDbakopa01 -6.578e+06 1.591e+06 -4.136 3.64e-05 ***
playerIDbaldero01 -7.053e+06 2.328e+06 -3.030 0.002468 **
playerIDbarajro01 -8.668e+06 1.511e+06 -5.737 1.06e-08 ***
playerIDbardjo01 -8.674e+06 1.973e+06 -4.396 1.14e-05 ***
playerIDbarmecl01 -8.677e+06 1.532e+06 -5.665 1.60e-08 ***
playerIDbarnebr02 -9.376e+06 2.362e+06 -3.969 7.37e-05 ***
playerIDbarneda01 -1.377e+07 1.990e+06 -6.919 5.50e-12 ***
playerIDbarnhtu01 -1.529e+07 3.098e+06 -4.934 8.48e-07 ***
playerIDbarremi01 -6.867e+06 1.544e+06 -4.447 9.02e-06 ***
playerIDbartlja01 -7.030e+06 1.972e+06 -3.565 0.000369 ***
playerIDbartoda02 -1.163e+07 2.327e+06 -4.998 6.10e-07 ***
playerIDbatisto01 -5.905e+06 1.823e+06 -3.240 0.001210 **
playerIDbautida01 -4.170e+06 2.006e+06 -2.079 0.037740 *
playerIDbautijo02 -5.166e+06 1.436e+06 -3.596 0.000328 ***
playerIDbaxtemi01 -1.178e+07 3.113e+06 -3.783 0.000158 ***
playerIDbayja01 -4.753e+06 1.444e+06 -3.292 0.001007 **
playerIDbeckhgo01 -1.248e+07 1.686e+06 -7.403 1.71e-13 ***
playerIDbeckhti01 -1.235e+07 2.342e+06 -5.273 1.43e-07 ***
playerIDbellda01 -6.356e+06 1.658e+06 -3.834 0.000129 ***
playerIDbellhma01 -6.670e+06 2.310e+06 -2.887 0.003912 **
playerIDbelliro01 -7.044e+06 1.463e+06 -4.815 1.55e-06 ***
playerIDbellja01 -4.870e+06 3.139e+06 -1.551 0.120888
playerIDbeltbr01 -1.150e+07 1.804e+06 -6.374 2.12e-10 ***
playerIDbeltrad01 -6.044e+05 1.299e+06 -0.465 0.641671
playerIDbeltrca01 1.127e+06 1.254e+06 0.899 0.368930
playerIDbenarma01 -5.642e+04 3.108e+06 -0.018 0.985517
playerIDbenjami01 6.736e+05 3.145e+06 0.214 0.830387
playerIDbennega01 -7.187e+06 1.671e+06 -4.301 1.75e-05 ***
playerIDbergda01 -5.325e+06 2.329e+06 -2.287 0.022273 *
playerIDberkmla01 -9.742e+05 1.408e+06 -0.692 0.488961
playerIDbernaro01 -9.117e+06 3.101e+06 -2.940 0.003311 **
playerIDberroan01 -7.616e+06 3.122e+06 -2.440 0.014763 *
playerIDbetanyu01 -9.590e+06 1.496e+06 -6.409 1.69e-10 ***
playerIDbetemwi01 -1.031e+07 1.972e+06 -5.229 1.81e-07 ***
playerIDbethach01 -1.291e+07 2.309e+06 -5.591 2.46e-08 ***
playerIDbettsmo01 -1.516e+07 2.330e+06 -6.507 8.92e-11 ***
playerIDbiggicr01 -4.264e+06 1.688e+06 -2.525 0.011606 *
playerIDblackch02 -1.079e+07 2.045e+06 -5.273 1.43e-07 ***
playerIDblakeca01 -6.557e+06 1.515e+06 -4.327 1.56e-05 ***
playerIDblaloha01 -7.270e+06 1.601e+06 -4.541 5.82e-06 ***
playerIDblancan01 -1.143e+07 1.971e+06 -5.801 7.25e-09 ***
playerIDblancgr01 -9.776e+06 1.688e+06 -5.792 7.64e-09 ***
playerIDblanche01 -7.770e+06 1.429e+06 -5.439 5.79e-08 ***
playerIDblankky01 -1.220e+07 3.090e+06 -3.949 8.02e-05 ***
playerIDbloomwi01 -8.476e+06 1.436e+06 -5.901 4.00e-09 ***
playerIDblumge01 -7.772e+06 1.486e+06 -5.231 1.80e-07 ***
playerIDboescbr01 -1.398e+07 3.090e+06 -4.524 6.29e-06 ***
playerIDbogaexa01 -1.652e+07 1.994e+06 -8.284 < 2e-16 ***
playerIDbondsba01 2.672e+06 1.995e+06 1.339 0.180526
playerIDbonifem01 -8.885e+06 1.981e+06 -4.485 7.57e-06 ***
playerIDbooneaa01 -5.571e+06 1.580e+06 -3.525 0.000430 ***
playerIDboonebr01 -1.095e+06 1.796e+06 -0.610 0.542164
playerIDborboju01 -8.529e+06 3.107e+06 -2.745 0.006085 **
playerIDbordimi01 -3.660e+06 2.305e+06 -1.588 0.112403
playerIDbourgja01 -1.279e+07 3.101e+06 -4.124 3.82e-05 ***
playerIDbourjpe01 -1.110e+07 1.800e+06 -6.164 8.02e-10 ***
playerIDbourju01 -1.447e+07 3.142e+06 -4.607 4.26e-06 ***
playerIDbournmi01 -3.358e+06 1.585e+06 -2.119 0.034192 *
playerIDbradlja02 -1.536e+07 1.990e+06 -7.719 1.58e-14 ***
playerIDbradlmi01 -5.041e+06 1.487e+06 -3.390 0.000708 ***
playerIDbrantmi02 -1.174e+07 2.002e+06 -5.864 4.99e-09 ***
playerIDbranyru01 -9.118e+06 1.804e+06 -5.054 4.58e-07 ***
playerIDbraunry02 -3.101e+06 1.498e+06 -2.071 0.038485 *
playerIDbrousbe01 -5.782e+06 2.312e+06 -2.500 0.012457 *
playerIDbrowndo01 -1.319e+07 2.285e+06 -5.771 8.66e-09 ***
playerIDbrownem01 -7.854e+06 1.973e+06 -3.981 7.03e-05 ***
playerIDbrowntr01 -1.372e+07 3.093e+06 -4.434 9.58e-06 ***
playerIDbroxtke01 -1.136e+07 3.131e+06 -3.629 0.000289 ***
playerIDbruceja01 -5.797e+06 1.624e+06 -3.569 0.000363 ***
playerIDbrunter01 -6.586e+06 2.310e+06 -2.851 0.004382 **
playerIDbryankr01 -1.643e+07 3.118e+06 -5.270 1.46e-07 ***
playerIDbuckjo01 -8.518e+06 1.580e+06 -5.392 7.49e-08 ***
playerIDbucktr01 -1.052e+07 3.089e+06 -3.404 0.000672 ***
playerIDburkech01 -6.977e+06 3.119e+06 -2.237 0.025357 *
playerIDburksel01 5.976e+02 2.327e+06 0.000 0.999795
playerIDburnije01 -2.642e+06 1.667e+06 -1.585 0.113118
playerIDburnsbi02 -1.402e+07 3.112e+06 -4.505 6.87e-06 ***
playerIDburrepa01 -4.267e+06 1.390e+06 -3.070 0.002159 **
playerIDburriem01 -9.879e+06 3.101e+06 -3.186 0.001458 **
playerIDburrose01 -6.060e+06 3.104e+06 -1.952 0.050988 .
playerIDbuterdr01 -1.337e+07 2.306e+06 -5.798 7.41e-09 ***
playerIDbutlebi03 -7.861e+06 1.600e+06 -4.914 9.41e-07 ***
playerIDbuxtoby01 -1.206e+07 3.122e+06 -3.864 0.000114 ***
playerIDbyrdma01 -8.936e+06 1.455e+06 -6.143 9.13e-10 ***
playerIDbyrneer01 -3.386e+06 1.680e+06 -2.015 0.043958 *
playerIDcabreas01 -7.864e+06 1.590e+06 -4.946 7.98e-07 ***
playerIDcabreev01 -1.025e+07 1.987e+06 -5.159 2.65e-07 ***
playerIDcabrejo02 -6.823e+06 3.097e+06 -2.203 0.027646 *
playerIDcabreme01 -6.932e+06 1.460e+06 -4.747 2.16e-06 ***
playerIDcabremi01 3.633e+06 1.434e+06 2.533 0.011350 *
playerIDcabreor01 -6.306e+06 1.397e+06 -4.513 6.63e-06 ***
playerIDcainlo01 -1.093e+07 1.816e+06 -6.020 1.95e-09 ***
playerIDcairomi01 -7.048e+06 1.438e+06 -4.903 9.95e-07 ***
playerIDcalhoko01 -1.648e+07 1.984e+06 -8.305 < 2e-16 ***
playerIDcallaal01 -1.228e+07 1.658e+06 -7.408 1.65e-13 ***
playerIDcamermi01 -2.582e+06 1.361e+06 -1.897 0.057910 .
playerIDcampber01 -1.480e+07 3.094e+06 -4.784 1.80e-06 ***
playerIDcanoro01 -9.624e+05 1.448e+06 -0.665 0.506314
playerIDcantujo01 -8.545e+06 1.989e+06 -4.295 1.80e-05 ***
playerIDcarpema01 -1.379e+07 1.819e+06 -7.581 4.52e-14 ***
playerIDcarpmi01 -1.311e+07 2.326e+06 -5.636 1.89e-08 ***
playerIDcarreez01 -1.222e+07 3.107e+06 -3.933 8.59e-05 ***
playerIDcarroja01 -7.805e+06 1.474e+06 -5.294 1.28e-07 ***
playerIDcartech02 -1.340e+07 2.091e+06 -6.407 1.72e-10 ***
playerIDcasalcu01 -1.346e+07 3.104e+06 -4.335 1.50e-05 ***
playerIDcaseyse01 -4.578e+06 1.536e+06 -2.981 0.002895 **
playerIDcasilal01 -8.736e+06 2.329e+06 -3.751 0.000179 ***
playerIDcasteni01 -1.579e+07 2.330e+06 -6.779 1.45e-11 ***
playerIDcastijo02 -7.970e+06 2.309e+06 -3.452 0.000564 ***
playerIDcastilu01 -3.560e+06 1.461e+06 -2.437 0.014870 *
playerIDcastiru01 -4.103e+06 3.088e+06 -1.329 0.184082
playerIDcastivi02 -5.225e+06 1.681e+06 -3.108 0.001902 **
playerIDcastiwe01 -1.413e+07 1.989e+06 -7.103 1.51e-12 ***
playerIDcastrja01 -1.195e+07 2.174e+06 -5.496 4.21e-08 ***
playerIDcastrju01 -5.864e+06 1.681e+06 -3.489 0.000492 ***
playerIDcastrra01 -1.048e+07 1.808e+06 -5.795 7.53e-09 ***
playerIDcastrst01 -1.075e+07 1.691e+06 -6.355 2.39e-10 ***
playerIDcatalfr01 -5.428e+06 1.535e+06 -3.537 0.000411 ***
playerIDcedenro01 -2.235e+06 1.979e+06 -1.130 0.258746
playerIDcedenro02 -1.033e+07 1.680e+06 -6.148 8.85e-10 ***
playerIDcervefr01 -1.367e+07 1.983e+06 -6.895 6.52e-12 ***
playerIDcespeyo01 -1.871e+06 1.673e+06 -1.118 0.263579
playerIDchaveen01 -7.759e+06 1.812e+06 -4.282 1.91e-05 ***
playerIDchaveer01 -5.147e+06 1.385e+06 -3.716 0.000206 ***
playerIDchiriro01 -1.436e+07 2.007e+06 -7.155 1.04e-12 ***
playerIDchiselo01 -1.187e+07 2.006e+06 -5.916 3.66e-09 ***
playerIDchoicmi01 -1.371e+07 3.093e+06 -4.431 9.70e-06 ***
playerIDchoiji01 -1.608e+07 3.096e+06 -5.195 2.18e-07 ***
playerIDchoosh01 -3.947e+06 1.582e+06 -2.495 0.012630 *
playerIDchurcry01 -9.241e+06 1.970e+06 -4.690 2.85e-06 ***
playerIDcintral01 -6.459e+06 2.327e+06 -2.775 0.005548 **
playerIDcirilje01 -3.909e+06 1.808e+06 -2.162 0.030685 *
playerIDclarkbr02 -6.416e+06 2.364e+06 -2.715 0.006673 **
playerIDclarkto02 -5.633e+06 1.601e+06 -3.518 0.000442 ***
playerIDclaytro01 -5.493e+06 1.597e+06 -3.439 0.000591 ***
playerIDclevest01 -1.445e+07 3.106e+06 -4.651 3.45e-06 ***
playerIDcoghlch01 -1.103e+07 1.979e+06 -5.571 2.75e-08 ***
playerIDcolabch01 -1.251e+07 3.100e+06 -4.037 5.54e-05 ***
playerIDcolbrgr01 -4.381e+06 3.122e+06 -1.403 0.160729
playerIDcollity01 -1.500e+07 3.095e+06 -4.847 1.32e-06 ***
playerIDcolonch01 -1.341e+07 2.336e+06 -5.741 1.03e-08 ***
playerIDcolvity01 -1.169e+07 3.103e+06 -3.768 0.000168 ***
playerIDconfomi01 -1.604e+07 3.090e+06 -5.193 2.21e-07 ***
playerIDcongeha01 -1.385e+07 1.986e+06 -6.976 3.69e-12 ***
playerIDconinje01 -5.492e+06 1.592e+06 -3.450 0.000567 ***
playerIDcoomero01 -6.133e+06 3.094e+06 -1.982 0.047572 *
playerIDcoraal01 -7.491e+06 1.430e+06 -5.237 1.74e-07 ***
playerIDcordewi01 -9.184e+06 3.186e+06 -2.883 0.003972 **
playerIDcordoma01 -4.748e+06 3.099e+06 -1.532 0.125592
playerIDcorpoca01 -1.390e+07 2.353e+06 -5.910 3.80e-09 ***
playerIDcorreca01 -1.652e+07 3.195e+06 -5.170 2.48e-07 ***
playerIDcounscr01 -5.881e+06 1.412e+06 -4.166 3.19e-05 ***
playerIDcowgico01 -1.388e+07 3.088e+06 -4.494 7.26e-06 ***
playerIDcozarza01 -1.343e+07 2.015e+06 -6.665 3.13e-11 ***
playerIDcraigal01 -1.232e+07 2.430e+06 -5.070 4.22e-07 ***
playerIDcrawfbr01 -1.262e+07 1.821e+06 -6.932 5.04e-12 ***
[ reached getOption("max.print") -- omitted 858 rows ]
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2889000 on 3072 degrees of freedom
Multiple R-squared: 0.7694, Adjusted R-squared: 0.6903
F-statistic: 9.724 on 1054 and 3072 DF, p-value: < 2.2e-16
full_model_pred <- bstats_sure %>% add_predictions(full_model)
full_model_pred
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
adv_stat_mod <- lm(salary ~ OPS, data = bstats_salary_21century)
summary(adv_stat_mod)
Call:
lm(formula = salary ~ OPS, data = bstats_salary_21century)
Residuals:
Min 1Q Median 3Q Max
-10237320 -3222583 -1313128 1912953 26166519
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7241235 498661 -14.52 <2e-16 ***
OPS 16617138 664556 25.00 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4837000 on 4125 degrees of freedom
Multiple R-squared: 0.1316, Adjusted R-squared: 0.1314
F-statistic: 625.2 on 1 and 4125 DF, p-value: < 2.2e-16
#including 2002 and up because salary becomes higher
bstats_salary_21century <- bstats_salary %>%
filter(yearID >= 2002)
bstats_salary_21century %>% head(10)
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
# setting seed to generate a reproducible random sampling
set.seed(123)
# defining training control as cross-validation and value of K equal to 10
train_control <- trainControl(method = "cv",
number = 10)
# training the model
model <- train(salary ~ OBP, data = bstats_salary_21century,
method = "lm",
trControl = train_control)
print(model)
Linear Regression
4127 samples
1 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 3712, 3714, 3713, 3715, 3715, 3715, ...
Resampling results:
RMSE Rsquared MAE
4903905 0.1069253 3691779
Tuning parameter 'intercept' was held constant at a value of TRUE
bstats_salary_numvars <- bstats_salary_21century %>%
select(c(6:32))
#Correlation mapping
#making correlation heat map
corr_numeric <- round(cor(bstats_salary_numvars), 1)
#plot to visualize the correlations
ggcorrplot(corr_numeric,
type = "lower",
lab = TRUE,
lab_size = 2,
colors = c("tomato2", "white", "springgreen3"),
title="Correlogram of batting Data",
ggtheme=theme_bw)
regfit.full = regsubsets(salary ~., data = bstats_salary_numvars, nvmax = 13, method="exhaustive")
3 linear dependencies found
Reordering variables and trying again:
summary(regfit.full)
Subset selection object
Call: regsubsets.formula(salary ~ ., data = bstats_salary_numvars,
nvmax = 13, method = "exhaustive")
26 Variables (and intercept)
Forced in Forced out
G FALSE FALSE
AB FALSE FALSE
R FALSE FALSE
H FALSE FALSE
X2B FALSE FALSE
X3B FALSE FALSE
HR FALSE FALSE
RBI FALSE FALSE
SB FALSE FALSE
CS FALSE FALSE
BB FALSE FALSE
SO FALSE FALSE
IBB FALSE FALSE
HBP FALSE FALSE
SH FALSE FALSE
SF FALSE FALSE
GIDP FALSE FALSE
BA FALSE FALSE
SlugPct FALSE FALSE
OBP FALSE FALSE
BABIP FALSE FALSE
K_Percent FALSE FALSE
BB_Percent FALSE FALSE
PA FALSE FALSE
TB FALSE FALSE
OPS FALSE FALSE
1 subsets of each size up to 14
Selection Algorithm: exhaustive
G AB R H X2B X3B HR RBI SB CS BB SO IBB HBP SH SF GIDP BA PA TB SlugPct OBP OPS BABIP
1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "
3 ( 1 ) "*" " " " " " " " " " " " " " " " " " " " " " " " " " " "*" " " " " " " "*" " " " " " " " " " "
4 ( 1 ) "*" " " " " " " " " " " " " " " " " " " " " " " "*" " " "*" " " " " " " "*" " " " " " " " " " "
5 ( 1 ) "*" " " " " " " " " "*" " " " " " " " " " " " " "*" " " "*" " " " " " " "*" " " " " " " " " " "
6 ( 1 ) "*" " " " " " " "*" "*" " " " " " " " " " " " " "*" " " "*" " " " " " " "*" " " " " " " " " " "
7 ( 1 ) "*" " " " " " " "*" "*" " " " " " " " " " " " " "*" " " "*" " " " " " " "*" " " " " " " "*" " "
8 ( 1 ) "*" " " " " " " "*" "*" " " " " " " " " " " " " "*" " " "*" " " "*" " " "*" " " " " " " "*" " "
9 ( 1 ) "*" " " " " " " "*" "*" " " " " " " " " " " " " "*" " " "*" " " "*" " " "*" " " "*" " " " " " "
10 ( 1 ) "*" " " " " " " "*" "*" " " " " " " " " " " " " "*" " " "*" " " "*" " " "*" " " "*" " " " " " "
11 ( 1 ) "*" " " " " " " "*" "*" " " " " "*" "*" " " " " "*" " " "*" " " "*" " " "*" " " "*" " " " " " "
12 ( 1 ) "*" " " " " " " "*" "*" " " " " "*" "*" " " " " "*" " " "*" " " "*" " " "*" " " "*" " " " " " "
13 ( 1 ) "*" " " " " " " "*" "*" " " "*" "*" "*" " " "*" "*" " " "*" " " "*" " " "*" " " "*" " " " " " "
14 ( 1 ) "*" "*" " " " " "*" "*" "*" "*" "*" "*" "*" " " "*" " " "*" " " "*" " " " " " " " " " " "*" "*"
K_Percent BB_Percent
1 ( 1 ) " " " "
2 ( 1 ) " " "*"
3 ( 1 ) " " " "
4 ( 1 ) " " " "
5 ( 1 ) " " " "
6 ( 1 ) " " " "
7 ( 1 ) " " " "
8 ( 1 ) " " " "
9 ( 1 ) " " "*"
10 ( 1 ) "*" "*"
11 ( 1 ) " " "*"
12 ( 1 ) "*" "*"
13 ( 1 ) " " "*"
14 ( 1 ) " " " "
summary(regfit.full)$rsq
[1] 0.1735921 0.1997069 0.2379362 0.2627538 0.2763364 0.2820003 0.2891514 0.2937692 0.2962186 0.2979275 0.2997391
[12] 0.3013596 0.3020423 0.3031258
plot(summary(regfit.full)$rsq)
reg.summary <- summary(regfit.full) #get the summary
par(mfrow=c(2,2))
#rss plot - NOT USEFUL
plot(reg.summary$rss ,xlab="Number of Variables ",ylab="RSS",type="l")
#adjr2 plot
plot(reg.summary$adjr2 ,xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
max_adjr2 <- which.max(reg.summary$adjr2)
points(max_adjr2,reg.summary$adjr2[max_adjr2], col="red",cex=2,pch=20)
# AIC criterion (Cp) to minimize
plot(reg.summary$cp ,xlab="Number of Variables ",ylab="Cp", type='l')
min_cp <- which.min(reg.summary$cp )
points(min_cp, reg.summary$cp[min_cp],col="red",cex=2,pch=20)
# BIC criterion to minimize
plot(reg.summary$bic ,xlab="Number of Variables ",ylab="BIC",type='l')
min_bic <- which.min(reg.summary$bic)
points(min_bic,reg.summary$bic[min_bic],col="red",cex=2,pch=20)
#Forward stepwise selection
regfit.fwd = regsubsets(salary ~. , data=bstats_salary_numvars, nvmax=13, method ="forward")
3 linear dependencies found
Reordering variables and trying again:
summary(regfit.fwd)
Subset selection object
Call: regsubsets.formula(salary ~ ., data = bstats_salary_numvars,
nvmax = 13, method = "forward")
26 Variables (and intercept)
Forced in Forced out
G FALSE FALSE
AB FALSE FALSE
R FALSE FALSE
H FALSE FALSE
X2B FALSE FALSE
X3B FALSE FALSE
HR FALSE FALSE
RBI FALSE FALSE
SB FALSE FALSE
CS FALSE FALSE
BB FALSE FALSE
SO FALSE FALSE
IBB FALSE FALSE
HBP FALSE FALSE
SH FALSE FALSE
SF FALSE FALSE
GIDP FALSE FALSE
BA FALSE FALSE
SlugPct FALSE FALSE
OBP FALSE FALSE
BABIP FALSE FALSE
K_Percent FALSE FALSE
BB_Percent FALSE FALSE
PA FALSE FALSE
TB FALSE FALSE
OPS FALSE FALSE
1 subsets of each size up to 14
Selection Algorithm: forward
G AB R H X2B X3B HR RBI SB CS BB SO IBB HBP SH SF GIDP BA PA TB SlugPct OBP OPS BABIP
1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "
3 ( 1 ) "*" " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "
4 ( 1 ) "*" "*" " " " " " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "
5 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "
6 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " " " "
7 ( 1 ) "*" "*" " " " " "*" "*" " " "*" " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " " " "
8 ( 1 ) "*" "*" " " " " "*" "*" " " "*" " " " " " " " " " " " " "*" " " "*" " " " " " " " " " " " " " "
9 ( 1 ) "*" "*" " " " " "*" "*" " " "*" " " " " " " " " "*" " " "*" " " "*" " " " " " " " " " " " " " "
10 ( 1 ) "*" "*" " " " " "*" "*" " " "*" " " " " " " " " "*" " " "*" " " "*" " " " " " " " " " " " " " "
11 ( 1 ) "*" "*" "*" " " "*" "*" " " "*" " " " " " " " " "*" " " "*" " " "*" " " " " " " " " " " " " " "
12 ( 1 ) "*" "*" "*" " " "*" "*" " " "*" " " "*" " " " " "*" " " "*" " " "*" " " " " " " " " " " " " " "
13 ( 1 ) "*" "*" "*" " " "*" "*" " " "*" "*" "*" " " " " "*" " " "*" " " "*" " " " " " " " " " " " " " "
14 ( 1 ) "*" "*" "*" " " "*" "*" " " "*" "*" "*" "*" " " "*" " " "*" " " "*" " " " " " " " " " " " " " "
K_Percent BB_Percent
1 ( 1 ) " " " "
2 ( 1 ) " " "*"
3 ( 1 ) " " "*"
4 ( 1 ) " " "*"
5 ( 1 ) " " "*"
6 ( 1 ) " " "*"
7 ( 1 ) " " "*"
8 ( 1 ) " " "*"
9 ( 1 ) " " "*"
10 ( 1 ) "*" "*"
11 ( 1 ) "*" "*"
12 ( 1 ) "*" "*"
13 ( 1 ) "*" "*"
14 ( 1 ) "*" "*"
reg.summary <- summary(regfit.fwd) #get the summary
par(mfrow=c(2,2))
#rss plot - NOT USEFUL
plot(reg.summary$rss ,xlab="Number of Variables ",ylab="RSS",type="l")
#adjr2 plot
plot(reg.summary$adjr2 ,xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
max_adjr2 <- which.max(reg.summary$adjr2)
points(max_adjr2,reg.summary$adjr2[max_adjr2], col="red",cex=2,pch=20)
# AIC criterion (Cp) to minimize
plot(reg.summary$cp ,xlab="Number of Variables ",ylab="Cp", type='l')
min_cp <- which.min(reg.summary$cp )
points(min_cp, reg.summary$cp[min_cp],col="red",cex=2,pch=20)
# BIC criterion to minimize
plot(reg.summary$bic ,xlab="Number of Variables ",ylab="BIC",type='l')
min_bic <- which.min(reg.summary$bic)
points(min_bic,reg.summary$bic[min_bic],col="red",cex=2,pch=20)
#Backwards stepwise selection
regfit.bwd = regsubsets(salary ~. , data=bstats_salary_numvars,nvmax=13, method ="backward")
3 linear dependencies found
Reordering variables and trying again:
summary(regfit.bwd)
Subset selection object
Call: regsubsets.formula(salary ~ ., data = bstats_salary_numvars,
nvmax = 13, method = "backward")
26 Variables (and intercept)
Forced in Forced out
G FALSE FALSE
AB FALSE FALSE
R FALSE FALSE
H FALSE FALSE
X2B FALSE FALSE
X3B FALSE FALSE
HR FALSE FALSE
RBI FALSE FALSE
SB FALSE FALSE
CS FALSE FALSE
BB FALSE FALSE
SO FALSE FALSE
IBB FALSE FALSE
HBP FALSE FALSE
SH FALSE FALSE
SF FALSE FALSE
GIDP FALSE FALSE
BA FALSE FALSE
SlugPct FALSE FALSE
OBP FALSE FALSE
BABIP FALSE FALSE
K_Percent FALSE FALSE
BB_Percent FALSE FALSE
PA FALSE FALSE
TB FALSE FALSE
OPS FALSE FALSE
1 subsets of each size up to 14
Selection Algorithm: backward
G AB R H X2B X3B HR RBI SB CS BB SO IBB HBP SH SF GIDP BA PA TB SlugPct OBP OPS BABIP
1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " " " " " "
2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " " " " " "
3 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " " " " " "
4 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " "*" " " " " " " "*" " " " " " " " " " " " " " " " " " "
5 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " " " " " " "*" " " " " " " " " " " " " " " " " " "
6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " " "*" " " "*" " " " " " " " " " " " " " " " " " "
7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " " "*" " " "*" " " "*" " " " " " " " " " " " " " "
8 ( 1 ) "*" "*" " " " " "*" "*" " " " " " " " " "*" " " "*" " " "*" " " "*" " " " " " " " " " " " " " "
9 ( 1 ) "*" "*" " " " " "*" "*" " " " " " " " " "*" " " "*" " " "*" " " "*" " " " " " " "*" " " " " " "
10 ( 1 ) "*" "*" " " " " "*" "*" " " " " " " "*" "*" " " "*" " " "*" " " "*" " " " " " " "*" " " " " " "
11 ( 1 ) "*" "*" " " " " "*" "*" " " " " "*" "*" "*" " " "*" " " "*" " " "*" " " " " " " "*" " " " " " "
12 ( 1 ) "*" "*" " " " " "*" "*" " " "*" "*" "*" "*" " " "*" " " "*" " " "*" " " " " " " "*" " " " " " "
13 ( 1 ) "*" "*" " " " " "*" "*" "*" "*" "*" "*" "*" " " "*" " " "*" " " "*" " " " " " " "*" " " " " " "
14 ( 1 ) "*" "*" " " " " "*" "*" "*" "*" "*" "*" "*" " " "*" " " "*" " " "*" " " " " " " "*" " " " " "*"
K_Percent BB_Percent
1 ( 1 ) " " " "
2 ( 1 ) " " " "
3 ( 1 ) " " " "
4 ( 1 ) " " " "
5 ( 1 ) " " " "
6 ( 1 ) " " " "
7 ( 1 ) " " " "
8 ( 1 ) " " " "
9 ( 1 ) " " " "
10 ( 1 ) " " " "
11 ( 1 ) " " " "
12 ( 1 ) " " " "
13 ( 1 ) " " " "
14 ( 1 ) " " " "
reg.summary <- summary(regfit.bwd) #get the summary
par(mfrow=c(2,2))
#rss plot - NOT USEFUL
plot(reg.summary$rss ,xlab="Number of Variables ",ylab="RSS",type="l")
#adjr2 plot
plot(reg.summary$adjr2 ,xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
max_adjr2 <- which.max(reg.summary$adjr2)
points(max_adjr2, reg.summary$adjr2[max_adjr2], col="red", cex=2, pch=20)
# AIC criterion (Cp) to minimize
plot(reg.summary$cp ,xlab="Number of Variables ",ylab="Cp", type='l')
min_cp <- which.min(reg.summary$cp )
points(min_cp, reg.summary$cp[min_cp], col="red", cex=2, pch=20)
# BIC criterion to minimize
plot(reg.summary$bic, xlab="Number of Variables ", ylab="BIC", type='l')
min_bic <- which.min(reg.summary$bic)
points(min_bic, reg.summary$bic[min_bic], col="red", cex=2, pch=20)
#ridge regression
# getting the predictors
x_var <- bstats_salary_numvars %>% select(-salary) %>% as.matrix()
# getting the independent variable
y_var <- bstats_salary_numvars[,"salary"]
ridge <- glmnet(x_var, y_var, alpha=0)
summary(ridge)
Length Class Mode
a0 100 -none- numeric
beta 2600 dgCMatrix S4
df 100 -none- numeric
dim 2 -none- numeric
lambda 100 -none- numeric
dev.ratio 100 -none- numeric
nulldev 1 -none- numeric
npasses 1 -none- numeric
jerr 1 -none- numeric
offset 1 -none- logical
call 4 -none- call
nobs 1 -none- numeric
cv_ridge <- cv.glmnet(x_var, y_var, alpha = 0)
cv_ridge
Call: cv.glmnet(x = x_var, y = y_var, alpha = 0)
Measure: Mean-Squared Error
Lambda Index Measure SE Nonzero
min 216221 100 1.919e+13 5.986e+11 26
1se 1051399 83 1.976e+13 6.505e+11 26
plot(cv_ridge)
cv_ridge$lambda.min
[1] 216221.5
cv_ridge$lambda.1se
[1] 1051399
lbs_fun <- function(fit, offset_x=1, ...) {
L <- length(fit$lambda)
x <- log(fit$lambda[L]) + offset_x
y <- fit$beta[ ,L]
labs <- names(y)
text(x, y, labels=labs, ...)
}
plot(ridge, xvar = "lambda", label=T)
lbs_fun(ridge) # add namnes
abline(v = log(cv_ridge$lambda.min), col = "red", lty=2) #lambda.min
abline(v = log(cv_ridge$lambda.1se), col="blue", lty=2) #lambda.1se
min_ridge <- glmnet(x_var, y_var, alpha=0, lambda = cv_ridge$lambda.min)
coef(min_ridge)
27 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 3250721.298
G -58444.324
AB 6525.990
R 18080.066
H 13278.220
X2B -69523.006
X3B -242522.624
HR -18087.206
RBI 13495.290
SB 36983.072
CS -135994.183
BB 21055.638
SO 5972.853
IBB 85143.276
HBP -15492.692
SH -240184.732
SF 25292.413
GIDP 96266.785
BA 4307633.443
PA 5100.777
TB 1191.386
SlugPct 1694540.697
OBP 2293477.173
OPS 1307566.357
BABIP -9978065.658
K_Percent -2726576.999
BB_Percent 7042279.208
# Make predictions on the test data
predictions <- min_ridge %>% predict(x_var) %>% as.vector()
# Model performance metrics
data.frame(
RMSE = RMSE(predictions, y_var),
Rsquare = R2(predictions, y_var)
)
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
# Lasso
# getting the predictors
x_var <- bstats_salary_numvars %>% select(-salary) %>% as.matrix()
# getting the independent variable
y_var <- bstats_salary_numvars[,"salary"]
lasso <- glmnet(x_var, y_var, alpha=1)
summary(lasso)
Length Class Mode
a0 93 -none- numeric
beta 2418 dgCMatrix S4
df 93 -none- numeric
dim 2 -none- numeric
lambda 93 -none- numeric
dev.ratio 93 -none- numeric
nulldev 1 -none- numeric
npasses 1 -none- numeric
jerr 1 -none- numeric
offset 1 -none- logical
call 4 -none- call
nobs 1 -none- numeric
cv_lasso <- cv.glmnet(x_var, y_var, alpha = 1)
cv_lasso
Call: cv.glmnet(x = x_var, y = y_var, alpha = 1)
Measure: Mean-Squared Error
Lambda Index Measure SE Nonzero
min 4245 68 1.899e+13 6.269e+11 20
1se 91445 35 1.956e+13 6.738e+11 17
plot(cv_lasso)
lbs_fun <- function(fit, offset_x=1, ...) {
L <- length(fit$lambda)
x <- log(fit$lambda[L])+ offset_x
y <- fit$beta[, L]
labs <- names(y)
text(x, y, labels=labs, ...)
}
plot(lasso, xvar = "lambda", label=T)
lbs_fun(lasso)
abline(v=log(cv_lasso$lambda.min), col = "red", lty=2)
abline(v=log(cv_lasso$lambda.1se), col="blue", lty=2)
min_lasso <- glmnet(x_var, y_var, alpha=1, lambda = cv_lasso$lambda.min)
coef(min_lasso)
27 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 2264368.199
G -87629.654
AB 24232.612
R 18771.351
H .
X2B -107459.897
X3B -280673.479
HR -57743.509
RBI 20863.414
SB 33703.191
CS -150540.856
BB 28671.666
SO .
IBB 99202.779
HBP -8281.984
SH -238325.444
SF 19061.264
GIDP 87321.414
BA .
PA 1860.331
TB -1044.547
SlugPct 4865101.358
OBP .
OPS 2765165.903
BABIP -6624249.070
K_Percent -1340986.372
BB_Percent 7490373.836
se_lasso <- glmnet(x_var, y_var, alpha=1, lambda = cv_lasso$lambda.1se)
coef(se_lasso)
27 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 3242070.189
G -50386.031
AB 9548.952
R 12602.471
H .
X2B -8338.434
X3B -168194.535
HR 9621.891
RBI 15185.753
SB 4485.584
CS -39631.941
BB 29452.924
SO .
IBB 83501.590
HBP .
SH -218512.738
SF .
GIDP 86598.503
BA .
PA 1625.492
TB .
SlugPct .
OBP .
OPS 477409.136
BABIP .
K_Percent -2789571.448
BB_Percent 5662137.375
# Make predictions on the test data
predictions <- min_lasso %>% predict(x_var) %>% as.vector()
# Model performance metrics
data.frame(
RMSE = RMSE(predictions, y_var),
Rsquare = R2(predictions, y_var)
)
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
franchise <- c(`ANA` = "LAA", `ARI` = "ARI", `ATL` = "ATL",
`BAL` = "BAL", `BOS` = "BOS", `CAL` = "LAA",
`CHA` = "CHA", `CHN` = "CHN", `CIN` = "CIN",
`CLE` = "CLE", `COL` = "COL", `DET` = "DET",
`FLO` = "MIA", `HOU` = "HOU", `KCA` = "KCA",
`LAA` = "LAA", `LAN` = "LAN", `MIA` = "MIA",
`MIL` = "MIL", `MIN` = "MIN", `ML4` = "MIL",
`MON` = "WAS", `NYA` = "NYA", `NYM` = "NYN",
`NYN` = "NYN", `OAK` = "OAK", `PHI` = "PHI",
`PIT` = "PIT", `SDN` = "SDN", `SEA` = "SEA",
`SFG` = "SFN", `SFN` = "SFN", `SLN` = "SLN",
`TBA` = "TBA", `TEX` = "TEX", `TOR` = "TOR",
`WAS` = "WAS")
Salaries$franchise <- unname(franchise[Salaries$teamID])
avg_team_salaries <- Salaries %>%
group_by(yearID, franchise, lgID) %>%
summarise(salary = mean(salary)/1e6) %>%
filter(!(franchise == "CLE" & lgID == "NL"))
`summarise()` has grouped output by 'yearID', 'franchise'. You can override using the `.groups` argument.
ggplot(avg_team_salaries,
aes(x = yearID, y = salary, group = factor(franchise))) +
geom_path() +
labs(x = "Year", y = "Average team salary (millions USD)")
<<<<<<< HEAD
ggplot(Salaries, aes(x = factor(yearID), y = salary/1e5)) +
geom_boxplot(fill = "lightblue", outlier.size = 1) +
labs(x = "Year", y = "Salary (per $1,000,000)") +
coord_flip()
<<<<<<< HEAD
avg_team_salaries1 <- Salaries %>%
group_by(yearID, franchise, lgID) %>%
summarise(salary= mean(salary)/1e6) %>%
filter(!(franchise == "CLE" & lgID == "NL")) %>%
filter(yearID >= 2002)
`summarise()` has grouped output by 'yearID', 'franchise'. You can override using the `.groups` argument.
avg_team_salaries1 %>%
arrange(desc(salary))
<<<<<<< HEAD
=======
>>>>>>> 0a71bd239790e75b2d56eb2aeea95cd144536745
ggplot(avg_team_salaries1, aes(x = franchise, y = salary)) +
geom_bar(stat = "identity") +
labs(x = "Team", y = "Salary (per $100,000)")
ggplot(avg_team_salaries1, aes(x = franchise, y = salary, fill = franchise)) +
geom_boxplot(outlier.size = 1) +
labs(x = "Year", y = "Average Team Salary Since 2002 (per $10,000,000)") +
coord_flip()
<<<<<<< HEAD